References
NOTE:
- lightgbm trains faster with its binary data format (lgb.Dataset) than with a pd.DataFrame.
- lgb.cv runs faster than sklearn.model_selection.cross_val_score.
- hyperopt is usually faster than optuna, with similar accuracy.
import time
time_start_notebook = time.time()
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
# usual imports
!pip install watermark
!pip install scikit-plot
!pip install catboost
# HPO
!git clone https://github.com/thuijskens/scikit-hyperband.git
sys.path.append('scikit-hyperband/hyperband')
print('Environment: Google Colab')
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm, trange
import plotly_express as px
# modelling
import sklearn.metrics as skmetrics
from sklearn.model_selection import StratifiedKFold
# boosting
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# settings
sns.set()
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%matplotlib inline
%load_ext watermark
%watermark -iv
json 2.0.9 plotly_express 0.4.1 xgboost 1.2.0 catboost 0.23.2 pandas 1.1.4 autopep8 1.5.2 numpy 1.19.4 seaborn 0.11.0 joblib 0.17.0 lightgbm 2.3.1
def show_methods(obj, ncols=4, contains=None):
    """Return the public attribute names of *obj* laid out as a DataFrame.

    Parameters
    ----------
    obj : object
        Any object whose `dir()` listing we want to browse.
    ncols : int
        Number of columns to spread the names over.
    contains : str, optional
        If given, keep only names containing this substring.

    Returns
    -------
    pd.DataFrame with empty strings padding the shorter columns.
    """
    names = [attr for attr in dir(obj) if not attr.startswith('_')]
    if contains is not None:
        names = [attr for attr in names if contains in attr]
    # array_split tolerates uneven splits; transpose so names read down columns.
    chunks = np.array_split(names, ncols)
    return pd.DataFrame(chunks).T.fillna('')
def model_eval_bin(model_name, ytest, ypreds, yprobs2d, show_plots=True):
    """Evaluate a binary classifier, print/display the metrics and save a csv.

    Parameters
    ----------
    model_name : str
        Label for the results row and for the output csv file name.
    ytest : array-like
        True binary labels.
    ypreds : array-like
        Predicted hard 0/1 labels.
    yprobs2d : array-like of shape (n_samples, 2)
        Predicted class probabilities (used by the PR and ROC plots).
    show_plots : bool
        If True, draw precision-recall, ROC and confusion-matrix plots.

    Side effects: prints a classification report and confusion matrix,
    displays a styled metrics row, writes model_<model_name>.csv.
    """
    import sklearn.metrics as skmetrics
    import scikitplot.metrics as skpmetrics
    import os
    acc = skmetrics.accuracy_score(ytest, ypreds)
    precision = skmetrics.precision_score(ytest, ypreds)
    recall = skmetrics.recall_score(ytest, ypreds)
    f1 = skmetrics.f1_score(ytest, ypreds)
    # NOTE(review): roc_auc_score receives hard labels, not probabilities,
    # so this is the AUC of thresholded predictions — confirm that is intended.
    auc = skmetrics.roc_auc_score(ytest, ypreds)
    print(skmetrics.classification_report(ytest, ypreds))
    print(skmetrics.confusion_matrix(ytest, ypreds))
    df_res = pd.DataFrame({'Accuracy': [acc],
                           'Precision': [precision],
                           'Recall': [recall],
                           'F1-score': [f1],
                           'AUC': [auc]}, index=[model_name])
    display(df_res.style.format("{:.4f}"))
    # On Colab write to the working directory; locally to ../outputs.
    # Bug fix: the original used o = '.' which produced a hidden file
    # named '.model_<name>.csv'; and it created ../outputs even on Colab.
    if ENV_COLAB:
        o = './'
    else:
        # exist_ok avoids the isdir-then-makedirs race of the original.
        os.makedirs('../outputs', exist_ok=True)
        o = '../outputs/'
    df_res.to_csv(o + f'model_{model_name}.csv', index=True)
    if show_plots:
        skpmetrics.plot_precision_recall(ytest, yprobs2d)  # more focus on minority
        skpmetrics.plot_roc_curve(ytest, yprobs2d)         # equal focus on both groups
        skpmetrics.plot_confusion_matrix(ytest, ypreds)
def get_profit(y_true, y_pred):
    """Return the business profit (in dollars) of churn predictions.

    Each true positive (correctly flagged churner) earns $400, each
    false negative (missed churner) costs $200, and each false positive
    (false alarm) costs $100; true negatives are free.

    Bug fix: the original unpacked `confusion_matrix(...).ravel()` into
    four values, which raises when y_true/y_pred contain only one class
    (the matrix is then 1x1). Counting directly with numpy handles every
    case and drops the sklearn dependency for this trivial computation.

    Parameters
    ----------
    y_true, y_pred : array-like of 0/1 labels.

    Returns
    -------
    int : total profit.
    """
    yt = np.asarray(y_true).astype(int)
    yp = np.asarray(y_pred).astype(int)
    tp = int(np.sum((yt == 1) & (yp == 1)))
    fn = int(np.sum((yt == 1) & (yp == 0)))
    fp = int(np.sum((yt == 0) & (yp == 1)))
    profit = 400 * tp - 200 * fn - 100 * fp
    return profit
scoring = skmetrics.make_scorer(get_profit, greater_is_better=True)
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'
if ENV_COLAB:
path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'
df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)
df_train.head(2).append(df_train.tail(2))
(5634, 21) (1409, 21)
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1621-YNCJH | Female | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.4 | No |
| 1 | 7143-BQIBA | Male | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No |
| 5632 | 0862-PRCBS | Female | 0 | Yes | Yes | 68 | Yes | Yes | Fiber optic | No | Yes | No | Yes | Yes | Yes | Two year | Yes | Credit card (automatic) | 103.75 | 7039.45 | No |
| 5633 | 4656-CAURT | Male | 0 | No | No | 69 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Bank transfer (automatic) | 23.95 | 1713.1 | No |
target_name = 'Churn'
px.histogram(df_train, x=target_name,height=300,width=300)
px.histogram(df_train, x='gender', color=target_name,height=300,width=300)
df_train['TotalCharges'] = pd.to_numeric(df_train['TotalCharges'],errors='coerce').fillna(0)
df_test['TotalCharges'] = pd.to_numeric(df_test['TotalCharges'],errors='coerce').fillna(0)
df_train['SeniorCitizen'] = df_train['SeniorCitizen'].map({0:'No',1:'Yes'})
df_test['SeniorCitizen'] = df_test['SeniorCitizen'].map({0:'No',1:'Yes'})
df_Xtrain = df_train.drop(target_name,axis=1)
df_Xtest = df_test.drop(target_name,axis=1)
ser_ytrain = df_train[target_name].map({'No':0,'Yes':1})
ser_ytest = df_test[target_name].map({'No':0,'Yes':1})
ytrain = np.array(ser_ytrain).flatten()
ytest = np.array(ser_ytest).flatten()
index_name = 'customerID'
ser_train_ids = df_Xtrain.pop(index_name)
ser_test_ids = df_Xtest.pop(index_name)
df_Xtrain.head(2)
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | No | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.40 |
| 1 | Male | No | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 |
cols_num = list(df_train.select_dtypes('number').columns)
cols_num
['tenure', 'MonthlyCharges', 'TotalCharges']
cols_cat = list(df_train.select_dtypes('object').columns)
# gender is no good predictor as seen in EDA
cols_exclude = ['customerID','gender','TotalCharges'] + [target_name]
cols_cat = [ i for i in cols_cat if i not in cols_exclude ] + ['SeniorCitizen']
print(cols_cat)
['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'SeniorCitizen']
cols_num = ['TotalCharges','tenure', 'MonthlyCharges']
cols_num_old = cols_num
cols_cat_old = cols_cat
def combine_two_features(dfx, A, B):
    """Return a copy of *dfx* with one combined categorical column per pair.

    For each pair (a, b) from zip(A, B) a new column named "a_b" is added,
    holding the string concatenation dfx[a] + "_" + dfx[b]. The input
    frame is not modified.

    Parameters
    ----------
    dfx : pd.DataFrame with string-valued columns named in A and B.
    A, B : equal-length lists of column names to combine pairwise.
    """
    out = dfx.copy()
    assert len(A) == len(B)
    for left, right in zip(A, B):
        out[f'{left}_{right}'] = out[left] + '_' + out[right]
    return out
combineA = ['Partner']
combineB = ['Dependents']
combineA = combineA + ['SeniorCitizen']*5
combineB = combineB + ['Dependents','Partner','Contract',
'TechSupport','PaymentMethod']
cols_cat_new = [f'{a}_{b}' for a,b in zip(combineA,combineB)]
cols_cat = list(set(cols_cat + cols_cat_new))
print(cols_cat_new)
# print(cols_cat)
df_Xtrain = combine_two_features(df_Xtrain,combineA,combineB)
df_Xtest = combine_two_features(df_Xtest,combineA,combineB)
['Partner_Dependents', 'SeniorCitizen_Dependents', 'SeniorCitizen_Partner', 'SeniorCitizen_Contract', 'SeniorCitizen_TechSupport', 'SeniorCitizen_PaymentMethod']
def create_groupby_features(dfx, cat, num, agg, df_ref=None):
    """Add group-aggregate features "c_n_a" to a copy of *dfx*.

    For each (c, n, a) triple, the new column holds aggregate `a` of
    numeric column `n` grouped by categorical column `c`. Statistics are
    computed on `df_ref` (the global training frame by default, so test
    rows receive train statistics and there is no leakage) and mapped
    onto dfx by dfx's OWN category values.

    Bug fix: the original assigned `df_train.groupby(c)[n].transform(a)`
    directly, which pandas aligns by row index — wrong whenever dfx is
    not df_train itself (e.g. the test frame got values from train rows
    that happened to share an index, not from its own categories).

    Parameters
    ----------
    dfx : pd.DataFrame
        Frame to augment; must contain the columns in `cat`. Not modified.
    cat, num, agg : lists
        Categorical columns, numeric columns and aggregation names
        ('mean', 'max', ...), combined as a full cross product.
    df_ref : pd.DataFrame, optional
        Frame the statistics are computed on; defaults to the global
        df_train for backward compatibility.
    """
    if df_ref is None:
        df_ref = df_train
    dfx = dfx.copy()
    for c in cat:
        for n in num:
            grouped = df_ref.groupby(c)[n]
            for a in agg:
                # map by category value, never by row index
                dfx[f"{c}_{n}_{a}"] = dfx[c].map(grouped.agg(a))
    return dfx
# Using more features gave me worse AUC.
# cols_grpcat = ['Contract','PaymentMethod']
# cols_grpnum = ['TotalCharges','MonthlyCharges']
# cols_grpagg = ['mean', 'max', 'min']
cols_grpcat = ['Contract']
cols_grpnum = ['TotalCharges']
cols_grpagg = ['mean']
cols_num_new = [f'{c}_{n}_{a}'
for c in cols_grpcat
for n in cols_grpnum
for a in cols_grpagg]
cols_num = list(set(cols_num + cols_num_new))
print(cols_num_new)
# print(cols_num)
df_Xtrain = create_groupby_features(df_Xtrain,cols_grpcat, cols_grpnum, cols_grpagg)
df_Xtest = create_groupby_features(df_Xtest,cols_grpcat, cols_grpnum, cols_grpagg)
['Contract_TotalCharges_mean']
df_Xtrain.head(2)
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Partner_Dependents | SeniorCitizen_Dependents | SeniorCitizen_Partner | SeniorCitizen_Contract | SeniorCitizen_TechSupport | SeniorCitizen_PaymentMethod | Contract_TotalCharges_mean | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | No | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.40 | Yes_No | No_No | No_Yes | No_Two year | No_Yes | No_Credit card (automatic) | 3683.643192 |
| 1 | Male | No | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No_No | No_No | No_No | No_Month-to-month | No_Yes | No_Bank transfer (automatic) | 1370.923131 |
cols_drop = ['gender']
df_Xtrain = df_Xtrain.drop(cols_drop,axis=1)
df_Xtest = df_Xtest.drop(cols_drop,axis=1)
all_features = df_Xtrain.columns.tolist()
cols_cat_idx = [all_features.index(i)
for i in cols_cat]
# make sure no nans
df_Xtrain.isna().sum().sum(), df_Xtest.isna().sum().sum()
(0, 0)
df_Xtrain_full = df_Xtrain.copy()
ser_ytrain_full = ser_ytrain.copy()
ytrain_full = np.array(ser_ytrain_full).flatten()
# one hot encode
df_Xtrain_full = pd.get_dummies(df_Xtrain_full,columns=cols_cat)
df_Xtest = pd.get_dummies(df_Xtest,columns=cols_cat)
df_Xtrain_full.head()
| tenure | MonthlyCharges | TotalCharges | Contract_TotalCharges_mean | Partner_Dependents_No_No | Partner_Dependents_No_Yes | Partner_Dependents_Yes_No | Partner_Dependents_Yes_Yes | SeniorCitizen_No | SeniorCitizen_Yes | StreamingTV_No | StreamingTV_No internet service | StreamingTV_Yes | SeniorCitizen_Dependents_No_No | SeniorCitizen_Dependents_No_Yes | SeniorCitizen_Dependents_Yes_No | SeniorCitizen_Dependents_Yes_Yes | InternetService_DSL | InternetService_Fiber optic | InternetService_No | Contract_Month-to-month | Contract_One year | Contract_Two year | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | SeniorCitizen_TechSupport_No_No | SeniorCitizen_TechSupport_No_No internet service | SeniorCitizen_TechSupport_No_Yes | SeniorCitizen_TechSupport_Yes_No | SeniorCitizen_TechSupport_Yes_No internet service | SeniorCitizen_TechSupport_Yes_Yes | OnlineSecurity_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | PhoneService_No | PhoneService_Yes | PaperlessBilling_No | PaperlessBilling_Yes | SeniorCitizen_Partner_No_No | SeniorCitizen_Partner_No_Yes | SeniorCitizen_Partner_Yes_No | SeniorCitizen_Partner_Yes_Yes | TechSupport_No | TechSupport_No internet service | TechSupport_Yes | SeniorCitizen_PaymentMethod_No_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_No_Credit card (automatic) | SeniorCitizen_PaymentMethod_No_Electronic check | SeniorCitizen_PaymentMethod_No_Mailed check | SeniorCitizen_PaymentMethod_Yes_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_Yes_Credit card (automatic) | SeniorCitizen_PaymentMethod_Yes_Electronic check | SeniorCitizen_PaymentMethod_Yes_Mailed check | SeniorCitizen_Contract_No_Month-to-month | SeniorCitizen_Contract_No_One year | SeniorCitizen_Contract_No_Two year | SeniorCitizen_Contract_Yes_Month-to-month | 
SeniorCitizen_Contract_Yes_One year | SeniorCitizen_Contract_Yes_Two year | OnlineBackup_No | OnlineBackup_No internet service | OnlineBackup_Yes | DeviceProtection_No | DeviceProtection_No internet service | DeviceProtection_Yes | Dependents_No | Dependents_Yes | Partner_No | Partner_Yes | MultipleLines_No | MultipleLines_No phone service | MultipleLines_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 36 | 106.05 | 3834.40 | 3683.643192 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| 1 | 10 | 62.25 | 612.95 | 1370.923131 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| 2 | 25 | 19.15 | 477.60 | 1370.923131 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 |
| 3 | 7 | 20.00 | 137.60 | 1370.923131 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| 4 | 24 | 20.30 | 459.95 | 1370.923131 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
# check if all nunique >= 2
df_Xtrain_full.apply(pd.Series.nunique).nsmallest(5)
Partner_Dependents_No_No 2 Partner_Dependents_No_Yes 2 Partner_Dependents_Yes_No 2 Partner_Dependents_Yes_Yes 2 SeniorCitizen_No 2 dtype: int64
# check if all nunique >= 2
df_Xtest.apply(pd.Series.nunique).nsmallest(5)
Partner_Dependents_No_No 2 Partner_Dependents_No_Yes 2 Partner_Dependents_Yes_No 2 Partner_Dependents_Yes_Yes 2 SeniorCitizen_No 2 dtype: int64
# check if all are numbers
df_Xtrain_full.sum().sum(), df_Xtest.sum().sum()
(26266765.14999999, 6655873.894557063)
# check for nans
df_Xtrain_full.isna().sum().sum(), df_Xtest.isna().sum().sum()
(0, 0)
from sklearn.model_selection import train_test_split
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
df_Xtrain_full, ser_ytrain_full,
test_size=0.2,
random_state=SEED,
stratify=ser_ytrain_full)
Xtrain_full = df_Xtrain_full.to_numpy()
Xtrain = df_Xtrain.to_numpy()
Xvalid = df_Xvalid.to_numpy()
ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()
print(f"df_train : {df_train.shape}\n")
print(f"df_Xtrain : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}\n")
print(f"df_Xvalid : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}\n")
print(f"df_test : {df_test.shape}")
print(f"ser_ytest : This does not exist.")
df_Xtrain.head(2)
df_train : (5634, 21) df_Xtrain : (4507, 77) ser_ytrain : (4507,) df_Xvalid : (1127, 77) ser_yvalid : (1127,) df_test : (1409, 21) ser_ytest : This does not exist.
| tenure | MonthlyCharges | TotalCharges | Contract_TotalCharges_mean | Partner_Dependents_No_No | Partner_Dependents_No_Yes | Partner_Dependents_Yes_No | Partner_Dependents_Yes_Yes | SeniorCitizen_No | SeniorCitizen_Yes | StreamingTV_No | StreamingTV_No internet service | StreamingTV_Yes | SeniorCitizen_Dependents_No_No | SeniorCitizen_Dependents_No_Yes | SeniorCitizen_Dependents_Yes_No | SeniorCitizen_Dependents_Yes_Yes | InternetService_DSL | InternetService_Fiber optic | InternetService_No | Contract_Month-to-month | Contract_One year | Contract_Two year | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | SeniorCitizen_TechSupport_No_No | SeniorCitizen_TechSupport_No_No internet service | SeniorCitizen_TechSupport_No_Yes | SeniorCitizen_TechSupport_Yes_No | SeniorCitizen_TechSupport_Yes_No internet service | SeniorCitizen_TechSupport_Yes_Yes | OnlineSecurity_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | PhoneService_No | PhoneService_Yes | PaperlessBilling_No | PaperlessBilling_Yes | SeniorCitizen_Partner_No_No | SeniorCitizen_Partner_No_Yes | SeniorCitizen_Partner_Yes_No | SeniorCitizen_Partner_Yes_Yes | TechSupport_No | TechSupport_No internet service | TechSupport_Yes | SeniorCitizen_PaymentMethod_No_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_No_Credit card (automatic) | SeniorCitizen_PaymentMethod_No_Electronic check | SeniorCitizen_PaymentMethod_No_Mailed check | SeniorCitizen_PaymentMethod_Yes_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_Yes_Credit card (automatic) | SeniorCitizen_PaymentMethod_Yes_Electronic check | SeniorCitizen_PaymentMethod_Yes_Mailed check | SeniorCitizen_Contract_No_Month-to-month | SeniorCitizen_Contract_No_One year | SeniorCitizen_Contract_No_Two year | SeniorCitizen_Contract_Yes_Month-to-month | 
SeniorCitizen_Contract_Yes_One year | SeniorCitizen_Contract_Yes_Two year | OnlineBackup_No | OnlineBackup_No internet service | OnlineBackup_Yes | DeviceProtection_No | DeviceProtection_No internet service | DeviceProtection_Yes | Dependents_No | Dependents_Yes | Partner_No | Partner_Yes | MultipleLines_No | MultipleLines_No phone service | MultipleLines_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4555 | 16 | 19.75 | 294.90 | 1370.923131 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| 3379 | 72 | 64.70 | 4746.05 | 3683.643192 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
lgb.Dataset(
label = None,
reference = None,
weight = None,
group = None,
init_score = None,
silent = False,
feature_name = 'auto',
categorical_feature = 'auto',
params = None,
free_raw_data = True,
)
show_methods(lgb)
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | Booster | basic | engine | plotting |
| 1 | Dataset | callback | libpath | print_evaluation |
| 2 | LGBMClassifier | compat | os | record_evaluation |
| 3 | LGBMModel | create_tree_digraph | plot_importance | reset_parameter |
| 4 | LGBMRanker | cv | plot_metric | sklearn |
| 5 | LGBMRegressor | dir_path | plot_split_value_histogram | train |
| 6 | absolute_import | early_stopping | plot_tree | version_file |
show_methods(lgb.Dataset)
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | add_features_from | get_group | num_data | set_group |
| 1 | construct | get_init_score | num_feature | set_init_score |
| 2 | create_valid | get_label | save_binary | set_label |
| 3 | get_data | get_monotone_constraints | set_categorical_feature | set_reference |
| 4 | get_feature_penalty | get_ref_chain | set_feature_name | set_weight |
| 5 | get_field | get_weight | set_field | subset |
dtrain_full = lgb.Dataset(df_Xtrain_full, ytrain_full,free_raw_data=False)
dtrain = lgb.Dataset(df_Xtrain, ytrain,free_raw_data=False)
dvalid = lgb.Dataset(df_Xvalid, yvalid,free_raw_data=False, reference=dtrain)
# we don't need dtest
lgb.LGBMClassifier(
boosting_type = 'gbdt',
num_leaves = 31,
max_depth = -1,
learning_rate = 0.1,
n_estimators = 100,
subsample_for_bin = 200000,
objective = None,
class_weight = None,
min_split_gain = 0.0,
min_child_weight = 0.001,
min_child_samples = 20,
subsample = 1.0,
subsample_freq = 0,
colsample_bytree = 1.0,
reg_alpha = 0.0,
reg_lambda = 0.0,
random_state = None,
n_jobs = -1,
silent = True,
importance_type = 'split',
**kwargs,
)
--------------------------- model.fit
model.fit(
sample_weight = None,
init_score = None,
eval_set = None,
eval_names = None,
eval_sample_weight = None,
eval_class_weight = None,
eval_init_score = None,
eval_metric = None,
early_stopping_rounds = None,
verbose = True,
feature_name = 'auto',
categorical_feature = 'auto',
callbacks = None
)
model_name = 'lightgbm'
hpo_name = 'hyperopt'
from lightgbm import LGBMClassifier
# https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py
metric_profit_name = 'profit'
def metric_profit_bst(log_odds, dtrain):
    """Custom eval metric (booster API) reporting business profit.

    With a binary objective lightgbm passes raw scores (log-odds) to
    `feval`, so a sigmoid is applied before rounding to hard 0/1
    predictions. Returns the (name, value, greater_is_better) triple
    that lightgbm expects.
    """
    labels = np.asarray(dtrain.get_label()).astype(int)
    probs = 1.0 / (1.0 + np.exp(-log_odds))  # sigmoid of raw scores
    preds = np.rint(probs)
    return 'profit', get_profit(labels, preds), True
metric_profit_name = 'profit'
def metric_profit(y_true, y_prob):
    """Custom eval metric (sklearn API) reporting business profit.

    The sklearn wrapper (LGBMClassifier.fit eval_metric) passes class
    probabilities rather than raw scores, so rounding alone yields the
    hard predictions. Returns (name, value, greater_is_better).
    """
    labels = np.asarray(y_true).astype(int)
    preds = np.rint(y_prob)
    return metric_profit_name, get_profit(labels, preds), True
params = dict(random_state=SEED,n_estimators=1000)
model = LGBMClassifier(**params)
model.fit(df_Xtrain,ytrain,
eval_set=(df_Xvalid, ser_yvalid),
eval_metric = metric_profit,
early_stopping_rounds=20,
verbose=0,
)
ypreds = model.predict(df_Xtest)
yprobs2d = model.predict_proba(df_Xtest)
profit = get_profit(ytest,ypreds)
print(f'test profit = ${profit:,d}')
model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=False)
test profit = $-21,200
precision recall f1-score support
0 0.78 0.93 0.85 1035
1 0.59 0.27 0.37 374
accuracy 0.76 1409
macro avg 0.69 0.60 0.61 1409
weighted avg 0.73 0.76 0.72 1409
[[965 70]
[273 101]]
| Accuracy | Precision | Recall | F1-score | AUC | |
|---|---|---|---|---|---|
| lightgbm | 0.7566 | 0.5906 | 0.2701 | 0.3706 | 0.6012 |
ypreds[:5]
array([0, 0, 0, 0, 0])
show_methods(model)
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | best_iteration_ | fit | n_estimators | reg_alpha |
| 1 | best_score_ | get_params | n_features_ | reg_lambda |
| 2 | booster_ | importance_type | n_jobs | score |
| 3 | boosting_type | learning_rate | num_leaves | set_params |
| 4 | class_weight | max_depth | objective | silent |
| 5 | classes_ | min_child_samples | objective_ | subsample |
| 6 | colsample_bytree | min_child_weight | predict | subsample_for_bin |
| 7 | evals_result_ | min_split_gain | predict_proba | subsample_freq |
| 8 | feature_importances_ | n_classes_ | random_state |
e = model.evals_result_
# print(e)
out = """
{'valid_0': OrderedDict([
('binary_logloss', [0.5495546455956963,...],
('profit', [-59800, -59800,...]
"""
k0 = list(e.keys())[0]
k1 = list(e[k0].keys())[0]
print(e[k0][k1][:2])
#n_used = len(e['valid_0']['binary_logloss']) # only these trees are used
n_used = len(e[k0][k1])
print('early stop used: ', n_used)
[0.5495546455956963, 0.5287290543184567] early stop used: 48
params = {'boosting_type': 'gbdt',
'random_state': 100,
'n_jobs': -1,
'learning_rate': 0.02,
'max_depth': 3,
# 'n_estimators': 500, # same as num_boost_round
'reg_alpha': 0.01,
'reg_lambda': 0.02,
'scale_pos_weight': 4,
'subsample': 1.0}
bst = lgb.train(params,
dtrain,
num_boost_round=10,
valid_sets=dvalid,
verbose_eval=0,
)
print(f'Finished {bst.current_iteration()} rounds')
yprobs = bst.predict(df_Xtest)
ypreds = np.rint(yprobs)
profit = get_profit(ytest,ypreds)
print(skmetrics.confusion_matrix(ytest,ypreds))
print(f'profit = ${profit:,.0f}')
Finished 10 rounds [[1035 0] [ 374 0]] profit = $-74,800
show_methods(bst)
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | add_valid | feature_importance | name_valid_sets | reset_parameter |
| 1 | attr | feature_name | network | rollback_one_iter |
| 2 | best_iteration | free_dataset | num_feature | save_model |
| 3 | best_score | free_network | num_model_per_iteration | set_attr |
| 4 | current_iteration | get_leaf_output | num_trees | set_network |
| 5 | dump_model | get_split_value_histogram | pandas_categorical | set_train_data_name |
| 6 | eval | handle | params | shuffle_models |
| 7 | eval_train | model_from_string | predict | update |
| 8 | eval_valid | model_to_string | refit |
# bst.feature_name()
print('Saving model...')
path_lgb_bst = '../artifacts/lgb_bst.joblib'
joblib.dump(bst, path_lgb_bst)
Saving model...
['../artifacts/lgb_bst.joblib']
bst_old = joblib.load(path_lgb_bst)
# continue training
bst = lgb.train(params,
dtrain,
num_boost_round=10,
init_model=bst_old,
valid_sets=dvalid,
verbose_eval=0,
)
print(f'Finished {bst.current_iteration()} rounds')
print(bst.current_iteration())
Finished 20 rounds 20
# change other parameters during training
bst = lgb.train(params,
dtrain,
num_boost_round=10,
init_model=bst,
valid_sets=dvalid,
verbose_eval=0,
callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] *5)])
print(f'Finished {bst.current_iteration()} rounds')
print(bst.current_iteration())
Finished 30 rounds 30
# custom eval metric
evals_result = {}
bst = lgb.train(params,
dtrain,
num_boost_round=10,
init_model=bst,
valid_sets=[dvalid,dtrain],
valid_names=['valid','train'],
verbose_eval=0,
feval=metric_profit_bst,
evals_result = evals_result,
)
print(f'Finished {bst.current_iteration()} rounds')
Finished 40 rounds
lgb.plot_metric(evals_result, metric=metric_profit_name)
<matplotlib.axes._subplots.AxesSubplot at 0x7fe3d688f050>
print(f'train : {df_Xtrain.shape[0]:,d}')
print(f'valid : {df_Xvalid.shape[0]:,d}')
print(f'test : {df_Xtest.shape[0]:,d}')
print(bst.best_iteration,'\n')
print(bst.best_score,'\n')
print(bst.params)
train : 4,507
valid : 1,127
test : 1,409
0
defaultdict(<class 'collections.OrderedDict'>, {'train': OrderedDict([('profit', 147300)]), 'valid': OrderedDict([('profit', 36800)])})
{'boosting_type': 'gbdt', 'random_state': 100, 'n_jobs': -1, 'learning_rate': 0.02, 'max_depth': 3, 'reg_alpha': 0.01, 'reg_lambda': 0.02, 'scale_pos_weight': 4, 'subsample': 1.0}
yprobs = bst.predict(df_Xtest)
ypreds = np.array([1 if i> 0.5 else 0 for i in yprobs])
profit = get_profit(ytest,ypreds)
print(skmetrics.confusion_matrix(ytest,ypreds))
print(f'profit = ${profit:,.0f}')
[[1023 12] [ 335 39]] profit = $-52,600
--------------------------- lgb.cv
lgb.cv(
params,
train_set,
num_boost_round = 100,
folds = None,
nfold = 5,
stratified = True,
shuffle = True,
metrics = None,
fobj = None,
feval = None,
init_model = None,
feature_name = 'auto',
categorical_feature = 'auto',
early_stopping_rounds = None,
fpreproc = None,
verbose_eval = None,
show_stdv = True,
seed = 0,
callbacks = None,
eval_train_metric = False,
)
params['num_boost_round'] = 500
dict_cv_result = lgb.cv(
params,
dtrain_full,
nfold=5,
metrics='auc',
feval=metric_profit_bst,
early_stopping_rounds=None,
stratified=True,
seed=SEED,
show_stdv=True,
)
print(dict_cv_result.keys())
profits = dict_cv_result[metric_profit_name + '-mean']
print(f'number of rounds used : {len(profits)}')
print('first two values of profits', profits[:2])
print('Best CV score:', dict_cv_result['profit-mean'][-1])
note = """
1. When I use early_stopping_rounds, then i get only one profit even though round=500
2. If I use early_stopping_rounds=1000 and num_boost_round=500,
still, I got only 34 rounds.
This means to get all num_boost_round we must use early_stopping_rounds=None.
(greater value will not work.)
Note: (no early stopping)
- auc first increases a little, then decreases.
- profit-mean is same for all
"""
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/lightgbm/engine.py:503: UserWarning: Found `num_boost_round` in params. Will use it instead of argument
dict_keys(['auc-mean', 'auc-stdv', 'profit-mean', 'profit-stdv']) number of rounds used : 500 first two values of profits [36820.0, 36820.0] Best CV score: 49240.0
print(dict_cv_result['auc-mean'][:2], dict_cv_result['auc-stdv'][:2],
dict_cv_result['profit-mean'][:2], dict_cv_result['profit-stdv'][:2])
[0.8215705207084871, 0.82239542064941] [0.009881347078228762, 0.010764002682597588] [36820.0, 36820.0] [40.0, 40.0]
# we are interested only in last boosting round value
print(dict_cv_result['auc-mean'][-1], dict_cv_result['auc-stdv'][-1],
dict_cv_result['profit-mean'][-1], dict_cv_result['profit-stdv'][-1])
0.8486778953911391 0.009447300605339324 49240.0 1635.3592877407705
fmin(
fn,
space,
algo,
max_evals = 9223372036854775807,
timeout = None,
loss_threshold = None,
trials = None,
rstate = None,
allow_trials_fmin = True,
pass_expr_memo_ctrl = None,
catch_eval_exceptions = False,
verbose = True,
return_argmin = True,
points_to_evaluate = None,
max_queue_len = 1,
show_progressbar = True,
import hyperopt
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK, space_eval
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample
from pathlib import Path
path_trials_lgb = '../artifacts/lgb_trials_hyperopt.joblib'
if Path(path_trials_lgb).exists():
trials = joblib.load(path_trials_lgb)
else:
trials = hyperopt.Trials()
show_methods(hp)
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | absolute_import | normal | qloguniform | randint |
| 1 | choice | pchoice | qnormal | uniform |
| 2 | lognormal | qlognormal | quniform | uniformint |
| 3 | loguniform |
# https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
# https://indico.cern.ch/event/617754/contributions/2590694/attachments/1459648/2254154/catboost_for_CMS.pdf
space_lgb_hyperopt = {
'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1)),
'max_depth' : scope.int(hp.quniform('max_depth', 2, 32, 1)),
'n_estimators' : scope.int(hp.quniform('n_estimators', 100, 5000, 50)),
'max_bin' : scope.int(hp.quniform('max_bin', 8, 512, 1)),
'min_data_in_leaf' : scope.int(hp.quniform('min_data_in_leaf', 1, 512, 1)),
'min_data_in_bin' : scope.int(hp.quniform('min_data_in_bin', 1, 512, 1)),
'scale_pos_weight' : hp.randint('scale_pos_weight',1,20),
#'class_weight': hp.choice('class_weight', [None, 'balanced']),
'feature_fraction': hp.uniform('feature_fraction',0.4, 1.0),
'subsample' : hp.uniform ('subsample', 0.5, 1), # bagging_fraction
# reg_alpha and reg_lambda are lambda_l1 and lambda_l2
'lambda_l1': hp.choice('lambda_l1', [0, hp.loguniform('lambda_l1_positive', -16, 2)]),
'lambda_l2': hp.choice('lambda_l2', [0, hp.loguniform('lambda_l2_positive', -16, 2)]),
'min_child_weight': hp.loguniform('min_child_weight', -16, 5),
'min_gain_to_split' : hp.quniform('min_gain_to_split', 0.1, 5, 0.01),
}
# lgb.LGBMClassifier?
INT_PARAMS = ['n_estimators','num_boost_round','num_leaves',
'max_depth','max_bin',
'min_data_in_leaf','min_data_in_bin']
def lgb_objective_hyperopt(params):
    """Hyperopt objective: 5-fold CV business profit (negated for fmin).

    hyperopt's quniform samples arrive as floats, so the parameters in
    INT_PARAMS are cast to int in place before lightgbm sees them. Runs
    stratified 5-fold CV on the global `dtrain_full` Dataset with the
    custom profit metric and reports the last boosting round's stats.

    Returns a dict with 'loss' (negative mean profit, since fmin
    minimizes), hyperopt's status, and diagnostic fields.
    """
    # cast hyperopt's float samples to the ints lightgbm requires
    for key in INT_PARAMS:
        if key in params:
            params[key] = int(params[key])

    cv_result = lgb.cv(
        params,
        dtrain_full,
        nfold=5,
        metrics='auc',
        feval=metric_profit_bst,
        early_stopping_rounds=100,
        stratified=True,
        seed=SEED,
    )

    mean_profit = cv_result['profit-mean'][-1]
    # lightgbm names the spread 'stdv', not 'std'
    std_profit = cv_result['profit-stdv'][-1]
    mean_auc = cv_result['auc-mean'][-1]
    std_auc = cv_result['auc-stdv'][-1]
    rounds_used = len(cv_result['auc-mean'])

    # fmin minimizes the loss, so maximize profit via its negative
    return {'loss': -mean_profit, 'status': hyperopt.STATUS_OK,
            'profit': mean_profit, 'profit_std': std_profit,
            'auc': mean_auc, 'auc_std': std_auc,
            'num_best_round': rounds_used
            }
import warnings

# Quick smoke-test of the objective with just two evaluations before the
# longer search; silence LightGBM/hyperopt warnings for readable output.
max_evals = 2
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    params_best = hyperopt.fmin(
        fn=lgb_objective_hyperopt,
        space=space_lgb_hyperopt,
        algo=tpe.suggest,
        max_evals=max_evals,
        timeout=None,
        trials=trials,
        verbose=0,
        show_progressbar=True,
        rstate=np.random.RandomState(SEED),
    )

# fmin returns index-encoded choice parameters; space_eval maps them
# back to the actual values defined in the search space.
params_best = hyperopt.space_eval(space_lgb_hyperopt, params_best)
joblib.dump(trials, path_trials_lgb)
print(params_best,'\n')
# model evaluation
THRESHOLD = 0.3

# --- validation split: booster trained WITHOUT the validation rows ---
bst = lgb.train(params_best, dtrain, verbose_eval=0)
vdprobs = bst.predict(df_Xvalid)
# Same cutoff as the original list comprehension (1 if p > THRESHOLD else 0).
vdpreds = (vdprobs > THRESHOLD).astype(int)
profit = get_profit(yvalid, vdpreds)
print(skmetrics.confusion_matrix(yvalid, vdpreds))
print(f'valid profit = ${profit:,.0f}\n')

# --- test split: refit on train+valid, score held-out test rows ---
bst = lgb.train(params_best, dtrain_full, verbose_eval=0)
yprobs = bst.predict(df_Xtest)
ypreds = (yprobs > THRESHOLD).astype(int)
profit = get_profit(ytest, ypreds)
print(skmetrics.confusion_matrix(ytest, ypreds))
print(f'test profit = ${profit:,.0f}')
{'feature_fraction': 0.5077406523230918, 'lambda_l1': 0, 'lambda_l2': 0, 'learning_rate': 0.4403287041362657, 'max_bin': 9, 'max_depth': 17, 'min_child_weight': 0.0027662203453665685, 'min_data_in_bin': 146, 'min_data_in_leaf': 217, 'min_gain_to_split': 1.32, 'n_estimators': 1500, 'scale_pos_weight': 15, 'subsample': 0.8401980141509693}
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/lightgbm/engine.py:148: UserWarning: Found `n_estimators` in params. Will use it instead of argument
[[821   7]
 [279  20]]
valid profit = $-48,500

[[767 268]
 [ 75 299]]
test profit = $77,800
N_TRIALS = 100
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Run the search one evaluation at a time so tqdm can show progress.
    for _ in trange(N_TRIALS):
        # BUG FIX: hyperopt's `max_evals` is the TOTAL number of
        # evaluations in `trials`, not the number of NEW ones. With
        # max_evals=1 and a reused non-empty Trials object, fmin returns
        # immediately and no new points are ever sampled (the original
        # run's "best" params were identical to the 2-eval warm-up for
        # exactly this reason). Ask for one more than we already have.
        params_best = hyperopt.fmin(
            fn=lgb_objective_hyperopt,
            space=space_lgb_hyperopt,
            algo=tpe.suggest,
            max_evals=len(trials.trials) + 1,
            timeout=None,
            trials=trials,  # reuse previous trials between calls
            verbose=0,
            show_progressbar=True,
            rstate=np.random.RandomState(SEED),
        )

# Map index-encoded choices back to real parameter values and persist.
params_best = hyperopt.space_eval(space_lgb_hyperopt, params_best)
joblib.dump(trials, path_trials_lgb)

print('\n Best parameters')
print(params_best,'\n')
print('\n Best validation results')
print(trials.best_trial['result'])
print()
# model evaluation
THRESHOLD = 0.3
# Refit with the boosting-round count found by early stopping during CV.
best_rounds = trials.best_trial['result']['num_best_round']
params_best['n_estimators'] = best_rounds

# --- validation split ---
bst = lgb.train(params_best, dtrain, verbose_eval=0)
vdprobs = bst.predict(df_Xvalid)
# Same cutoff as the original list comprehension (1 if p > THRESHOLD else 0).
vdpreds = (vdprobs > THRESHOLD).astype(int)
profit = get_profit(yvalid, vdpreds)
print(skmetrics.confusion_matrix(yvalid, vdpreds))
print(f'valid profit = ${profit:,.0f}\n')

# --- test split (refit on train+valid) ---
bst = lgb.train(params_best, dtrain_full, verbose_eval=0)
yprobs = bst.predict(df_Xtest)
ypreds = (yprobs > THRESHOLD).astype(int)
profit = get_profit(ytest, ypreds)
print(skmetrics.confusion_matrix(ytest, ypreds))
print(f'test profit = ${profit:,.0f}')
100%|██████████| 1000/1000 [00:59<00:00, 16.91it/s] /Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/lightgbm/engine.py:148: UserWarning: Found `n_estimators` in params. Will use it instead of argument
Best parameters
{'feature_fraction': 0.5077406523230918, 'lambda_l1': 0, 'lambda_l2': 0, 'learning_rate': 0.4403287041362657, 'max_bin': 9, 'max_depth': 17, 'min_child_weight': 0.0027662203453665685, 'min_data_in_bin': 146, 'min_data_in_leaf': 217, 'min_gain_to_split': 1.32, 'n_estimators': 1500, 'scale_pos_weight': 15, 'subsample': 0.8401980141509693}
Best validation results
{'loss': -68280.0, 'status': 'ok', 'profit': 68280.0, 'profit_std': 1452.4462124292245, 'auc': 0.7345205389275101, 'auc_std': 0.007579597601982307, 'num_best_round': 101}
[[821 7]
[279 20]]
valid profit = $-48,500
[[767 268]
[ 75 299]]
test profit = $77,800
def lgb_objective_hyperopt_skf(params):
    """Hyperopt objective using an explicit StratifiedKFold loop.

    Slower than ``lgb.cv`` but more flexible: trains one LGBMClassifier
    per fold with early stopping, then scores profit and auc on each
    validation fold.

    Parameters
    ----------
    params : dict
        Hyperparameters sampled from ``space_lgb_hyperopt``. Mutated in
        place to cast integer-valued parameters.

    Returns
    -------
    dict
        hyperopt result dict; ``loss`` is negative mean fold profit.
    """
    # (Removed needless `global INT_PARAMS, CURRENT_BEST`: INT_PARAMS is
    # only read, and CURRENT_BEST was never used in this function.)
    # hp.quniform samples floats; LightGBM needs true ints for these.
    for int_param in INT_PARAMS:
        if int_param in params:
            params[int_param] = int(params[int_param])

    # skf is more time-consuming but more stable.
    skf = StratifiedKFold(n_splits=5, random_state=SEED, shuffle=True)
    profits = []
    aucs = []
    num_best_rounds = []
    for idx_tr, idx_vd in skf.split(df_Xtrain_full, ser_ytrain_full):
        Xtr, Xvd = df_Xtrain_full.iloc[idx_tr], df_Xtrain_full.iloc[idx_vd]
        # BUG FIX: use positional .iloc for BOTH halves of the split.
        # skf yields positional indices, but plain [] on a Series is
        # label-based — silently wrong (or a KeyError) whenever the
        # series index is not 0..n-1.
        ytr, yvd = ser_ytrain_full.iloc[idx_tr], ser_ytrain_full.iloc[idx_vd]

        model = LGBMClassifier(random_state=SEED, **params)
        model.fit(Xtr, ytr,
                  eval_set=[(Xvd, yvd)],
                  verbose=0,
                  early_stopping_rounds=100)

        # Rounds actually trained = length of the recorded eval history.
        e = model.evals_result_
        k0 = list(e.keys())[0]
        k1 = list(e[k0].keys())[0]
        num_best_round = len(e[k0][k1])
        num_best_rounds.append(num_best_round)

        # model predictions
        vdpreds = model.predict(Xvd)
        yvd = yvd.to_numpy().ravel()
        # NOTE(review): this computes ROC AUC on hard 0/1 predictions,
        # not probabilities — consider model.predict_proba(Xvd)[:, 1]
        # for a true AUC. Left as-is to preserve logged values.
        auc_now = skmetrics.roc_auc_score(yvd, vdpreds)
        aucs.append(auc_now)
        profit_now = get_profit(yvd, vdpreds)
        profits.append(profit_now)
    #=============================================================
    profit = np.mean(profits)
    profit_std = np.std(profits)
    auc = np.mean(aucs)
    auc_std = np.std(aucs)
    # Largest early-stopped round across folds, used for later refits.
    num_best_round = np.max(num_best_rounds)

    # debug
    print(f'profit : {profit:,.0f} auc : {auc:,.4f}')
    print(params)

    # hyperopt minimizes loss, so negate profit to maximize it.
    return {'loss' : -profit, 'status': hyperopt.STATUS_OK,
            'profit': profit, 'profit_std': profit_std,
            'auc' : auc, 'auc_std' : auc_std,
            'num_best_round': num_best_round
            }
N_TRIALS = 1_00
# BUG FIX: `trials` already holds all evaluations from the previous
# searches, and fmin's `max_evals` is a TOTAL budget, not a number of new
# evaluations. With max_evals=N_TRIALS <= len(trials.trials), fmin
# returns immediately without evaluating anything (the original run
# finished in 0 seconds with "?trial/s, best loss=?"). Request N_TRIALS
# *new* evaluations on top of the existing ones.
params_best = hyperopt.fmin(
    fn=lgb_objective_hyperopt_skf,
    space=space_lgb_hyperopt,
    algo=tpe.suggest,
    max_evals=len(trials.trials) + N_TRIALS,
    timeout=None,
    trials=trials,
    verbose=10,
    show_progressbar=True,
    rstate=np.random.RandomState(SEED),
)
# Decode index-encoded choices back to concrete values and persist.
params_best = hyperopt.space_eval(space_lgb_hyperopt, params_best)
joblib.dump(trials, path_trials_lgb)
100%|██████████| 100/100 [00:00<?, ?trial/s, best loss=?]
['../artifacts/lgb_trials_hyperopt.joblib']
print('\n Best parameters')
print(params_best,'\n')
print('\n Best validation results')
print(trials.best_trial['result'])
print()

# model evaluation — sklearn API this time; model.predict() applies the
# default 0.5 probability cutoff, so THRESHOLD is informational here.
THRESHOLD = 0.5
best_rounds = trials.best_trial['result']['num_best_round']
# params_best['n_estimators'] = best_rounds

#============================ model eval on valid set
model = lgb.LGBMClassifier(**params_best, random_state=SEED)
model.fit(df_Xtrain, ytrain)
vdpreds = model.predict(df_Xvalid)
profit = get_profit(yvalid, vdpreds)
print(skmetrics.confusion_matrix(yvalid, vdpreds))
print(f'valid profit = ${profit:,.0f}\n')

#========================== model eval on test set
model = lgb.LGBMClassifier(**params_best, random_state=SEED)
model.fit(df_Xtrain_full, ytrain_full)
ypreds = model.predict(df_Xtest)
profit = get_profit(ytest, ypreds)
print(skmetrics.confusion_matrix(ytest, ypreds))
print(f'test profit = ${profit:,.0f}')
Best parameters
{'feature_fraction': 0.5077406523230918, 'lambda_l1': 0, 'lambda_l2': 0, 'learning_rate': 0.4403287041362657, 'max_bin': 9, 'max_depth': 17, 'min_child_weight': 0.0027662203453665685, 'min_data_in_bin': 146, 'min_data_in_leaf': 217, 'min_gain_to_split': 1.32, 'n_estimators': 1500, 'scale_pos_weight': 15, 'subsample': 0.8401980141509693}
Best validation results
{'loss': -68280.0, 'status': 'ok', 'profit': 68280.0, 'profit_std': 1452.4462124292245, 'auc': 0.7345205389275101, 'auc_std': 0.007579597601982307, 'num_best_round': 101}
[[316 512]
[ 7 292]]
valid profit = $64,200
[[385 650]
[ 18 356]]
test profit = $73,800
print(list(trials.trials[0]['misc']['vals'].keys()))
['feature_fraction', 'lambda_l1', 'lambda_l1_positive', 'lambda_l2', 'lambda_l2_positive', 'learning_rate', 'max_bin', 'max_depth', 'min_child_weight', 'min_data_in_bin', 'min_data_in_leaf', 'min_gain_to_split', 'n_estimators', 'scale_pos_weight', 'subsample']
# Collect the loss and a few key hyperparameters from every trial into a
# DataFrame to visualize the search trajectory.
arr_trials = [
    [t['result']['loss'],
     t['misc']['vals']['learning_rate'][0],
     t['misc']['vals']['max_depth'][0],
     t['misc']['vals']['n_estimators'][0]]
    for t in trials.trials
]
df_trials = pd.DataFrame(
    arr_trials,
    columns=['score', 'learning_rate', 'max_depth', 'n_estimators'],
)
df_trials.plot(subplots=True, figsize=(10, 10));
trials.best_trial['result']
{'loss': -68280.0,
'status': 'ok',
'profit': 68280.0,
'profit_std': 1452.4462124292245,
'auc': 0.7345205389275101,
'auc_std': 0.007579597601982307,
'num_best_round': 101}
# do not use lgb.train but use model.fit
# (kept for comparison: the booster API with these params gives a much
# lower test profit than the sklearn-API fit above)
bst = lgb.train(params_best,dtrain_full,verbose_eval=0)
yprobs = bst.predict(df_Xtest)
# np.round applies a 0.5 cutoff (exact .5 ties round to even)
ypreds = np.round(yprobs).astype(int)
profit = get_profit(ytest,ypreds)
print(skmetrics.confusion_matrix(ytest,ypreds))
print(f'profit = ${profit:,.0f}')
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/lightgbm/engine.py:148: UserWarning: Found `n_estimators` in params. Will use it instead of argument
[[931 104] [188 186]] profit = $26,400
# Best parameters found by the search (kept for reference):
# params_best = {'feature_fraction': 0.5077406523230918,
#                'lambda_l1': 0, 'lambda_l2': 0,
#                'learning_rate': 0.4403287041362657,
#                'max_bin': 9, 'max_depth': 17,
#                'min_child_weight': 0.0027662203453665685,
#                'min_data_in_bin': 146, 'min_data_in_leaf': 217,
#                'min_gain_to_split': 1.32, 'n_estimators': 1500,
#                'scale_pos_weight': 15, 'subsample': 0.8401980141509693}

# Final refit on train+valid, evaluated on the held-out test set.
model = lgb.LGBMClassifier(random_state=SEED, **params_best)
model.fit(df_Xtrain_full, ytrain_full)
ypreds = model.predict(df_Xtest)
# BUG FIX: the original passed a stale `yprobs2d` left over from an
# earlier cell; compute THIS model's class probabilities for the report.
yprobs2d = model.predict_proba(df_Xtest)
profit = get_profit(ytest, ypreds)
# NOTE(review): the label says '+optuna' but this search used hyperopt —
# string kept as-is to preserve output; confirm intended label.
model_eval_bin(f'{model_name}+optuna', ytest, ypreds, yprobs2d, show_plots=True)
# NOTE(review): the `:,d` format requires an integer profit — TODO confirm
# get_profit returns int.
print(f"test profit = ${profit:,d}")
precision recall f1-score support
0 0.96 0.37 0.54 1035
1 0.35 0.95 0.52 374
accuracy 0.53 1409
macro avg 0.65 0.66 0.53 1409
weighted avg 0.80 0.53 0.53 1409
[[385 650]
[ 18 356]]
| Accuracy | Precision | Recall | F1-score | AUC | |
|---|---|---|---|---|---|
| lightgbm+optuna | 0.5259 | 0.3539 | 0.9519 | 0.5159 | 0.6619 |
test profit = $73,800
Function plot_roc_curve is deprecated; This will be removed in v0.5.0. Please use scikitplot.metrics.plot_roc instead.
import shap
# Load the JS needed to render interactive force plots in the notebook.
shap.initjs()
explainer = shap.TreeExplainer(model)
# For a LightGBM binary classifier, shap_values is a LIST of two arrays
# (one per class) — see the TreeExplainer note printed below.
shap_values = explainer.shap_values(df_Xtest)
Setting feature_perturbation = "tree_path_dependent" because no background data was given. LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
df_Xtest.head(2)
| tenure | MonthlyCharges | TotalCharges | Contract_TotalCharges_mean | Partner_Dependents_No_No | Partner_Dependents_No_Yes | Partner_Dependents_Yes_No | Partner_Dependents_Yes_Yes | SeniorCitizen_No | SeniorCitizen_Yes | StreamingTV_No | StreamingTV_No internet service | StreamingTV_Yes | SeniorCitizen_Dependents_No_No | SeniorCitizen_Dependents_No_Yes | SeniorCitizen_Dependents_Yes_No | SeniorCitizen_Dependents_Yes_Yes | InternetService_DSL | InternetService_Fiber optic | InternetService_No | Contract_Month-to-month | Contract_One year | Contract_Two year | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | SeniorCitizen_TechSupport_No_No | SeniorCitizen_TechSupport_No_No internet service | SeniorCitizen_TechSupport_No_Yes | SeniorCitizen_TechSupport_Yes_No | SeniorCitizen_TechSupport_Yes_No internet service | SeniorCitizen_TechSupport_Yes_Yes | OnlineSecurity_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | PhoneService_No | PhoneService_Yes | PaperlessBilling_No | PaperlessBilling_Yes | SeniorCitizen_Partner_No_No | SeniorCitizen_Partner_No_Yes | SeniorCitizen_Partner_Yes_No | SeniorCitizen_Partner_Yes_Yes | TechSupport_No | TechSupport_No internet service | TechSupport_Yes | SeniorCitizen_PaymentMethod_No_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_No_Credit card (automatic) | SeniorCitizen_PaymentMethod_No_Electronic check | SeniorCitizen_PaymentMethod_No_Mailed check | SeniorCitizen_PaymentMethod_Yes_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_Yes_Credit card (automatic) | SeniorCitizen_PaymentMethod_Yes_Electronic check | SeniorCitizen_PaymentMethod_Yes_Mailed check | SeniorCitizen_Contract_No_Month-to-month | SeniorCitizen_Contract_No_One year | SeniorCitizen_Contract_No_Two year | SeniorCitizen_Contract_Yes_Month-to-month | 
SeniorCitizen_Contract_Yes_One year | SeniorCitizen_Contract_Yes_Two year | OnlineBackup_No | OnlineBackup_No internet service | OnlineBackup_Yes | DeviceProtection_No | DeviceProtection_No internet service | DeviceProtection_Yes | Dependents_No | Dependents_Yes | Partner_No | Partner_Yes | MultipleLines_No | MultipleLines_No phone service | MultipleLines_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 48.6 | 48.6 | 3683.643192 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| 1 | 56 | 99.9 | 5706.3 | 1370.923131 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
# Look only first row of test data
# use matplotlib=True to avoid Javascript
idx = 0
# BUG FIX: for a binary LightGBM model, explainer.shap_values() returns a
# list [class0_shap, class1_shap]; indexing that list with [idx, :]
# raised "TypeError: list indices must be integers or slices, not tuple"
# (see the traceback below). Select the positive class (index 1)
# explicitly, and use the matching expected_value.
shap.force_plot(explainer.expected_value[1],
                shap_values[1][idx, :],
                df_Xtest.iloc[idx, :],
                matplotlib=False,
                text_rotation=90)
# for this row, the predicted label is ...
# red features makes it higher
# blue features makes it smaller.
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-83-0f954fe3c3ee> in <module> 3 idx = 0 4 shap.force_plot(explainer.expected_value, ----> 5 shap_values[idx,:], 6 df_Xtest.iloc[idx,:], 7 matplotlib=False, TypeError: list indices must be integers or slices, not tuple
# summary_plot accepts the per-class list directly (plots both classes).
shap.summary_plot(shap_values, df_Xtest)
shap.summary_plot(shap_values, df_Xtest, plot_type='bar')
# BUG FIX: dependence_plot requires a single 2-D ndarray, not the
# per-class list returned for binary LightGBM models — pass the
# positive-class (index 1) shap values.
shap.dependence_plot(ind='TotalCharges', interaction_index='tenure',
                     shap_values=shap_values[1],
                     features=df_Xtest,
                     display_features=df_Xtest)
# Report the notebook's total wall-clock runtime as hours/minutes/seconds.
time_taken = time.time() - time_start_notebook
hours, rem = divmod(time_taken, 3600)
mins, secs = divmod(rem, 60)
print('Time taken to run whole notebook: {:.0f} hr '
      '{:.0f} min {:.0f} secs'.format(hours, mins, secs))